- 样本模拟
- K-means聚类
- 层次聚类
- 其他
2017-06-18
# Simulate three labelled 2-D groups and stack them into `dfnormal`
# (columns: cc = true class label, x, y).

# Class 1: 1500 points, normal around (25, 16) with sd 4.5 on both axes.
x <- rnorm(1500, mean = 25, sd = 4.5)
y <- rnorm(1500, mean = 16, sd = 4.5)
dfnormal <- data.frame(cc = 1, x = x, y = y)

# Class 2: 400 points, uniform over the rectangle [2, 13] x [15, 24].
x <- runif(400, min = 2, max = 13)
y <- runif(400, min = 15, max = 24)
dfnormal <- rbind(dfnormal, data.frame(cc = 2, x = x, y = y))

# Class 3: 200 points, normal around (8, 8) with sd 1.5 on both axes.
x <- rnorm(200, mean = 8, sd = 1.5)
y <- rnorm(200, mean = 8, sd = 1.5)
dfnormal <- rbind(dfnormal, data.frame(cc = 3, x = x, y = y))

# Scatter plot coloured by the true class label.
# Equivalent non-formula form:
# plot(dfnormal$x, dfnormal$y, col = dfnormal$cc)
plot(y ~ x, data = dfnormal, col = cc)
# Simulate one noisy ring of labelled points.
#
# Arguments:
#   np   number of points whose base x is uniform over [-r, r]
#   np2  number of extra points drawn in each outermost unit-wide strip,
#        [-r, -r + 1] and [r - 1, r] (presumably to thicken the ring's
#        left/right edges -- confirm with the author)
#   cc   class label stored in the `cc` column of every row
#   r    ring radius
#   nsd  standard deviation of the Gaussian jitter added to x and y
#
# Returns a data.frame with columns cc, x, y and np + 2 * np2 rows.
# The jitter has mean 2 on both axes, so the ring is centred near (2, 2).
#
# Example arguments: np = 800, np2 = 100, r = 25, nsd = 2
getcircle <- function(np, np2, cc, r, nsd) {
  total <- np + np2 * 2

  # The order of the random draws below is kept fixed so results are
  # reproducible under a given seed.
  x_body <- runif(np, min = -r, max = r)
  x_left <- runif(np2, min = -r, max = -r + 1)
  x_right <- runif(np2, min = r - 1, max = r)
  ring_x <- c(x_body, x_left, x_right)

  # Random sign picks the upper or lower half of the circle for each point.
  half <- sign(rnorm(total))
  y_jitter <- rnorm(total, mean = 2, sd = nsd)
  x_jitter <- rnorm(total, mean = 2, sd = nsd)

  ring_y <- half * sqrt(r^2 - ring_x^2) + y_jitter
  data.frame(cc = cc, x = ring_x + x_jitter, y = ring_y)
}
# Stack three rings of decreasing radius (25, 13, 3), labelled 1, 2, 3,
# into a single data frame. The calls are evaluated in this order.
dfcircle <- rbind(
  getcircle(800, 100, 1, 25, 2),
  getcircle(300, 50, 2, 13, 1.6),
  getcircle(50, 10, 3, 3, 1)
)

# Scatter plot coloured by the true ring label.
# Equivalent non-formula form:
# plot(dfcircle$x, dfcircle$y, col = factor(dfcircle$cc))
plot(y ~ x, data = dfcircle, col = factor(cc))
# Drop the true class labels so clustering only sees the coordinates.
mdata <- dfnormal
mdata$cc <- NULL

# Run k-means.
# NOTE(review): the original notes sketch a 3-column result table below
# although 5 centres are requested here -- confirm the intended k.
cc <- kmeans(mdata, 5)
ccout <- fitted(cc)

# Inspect the clustering: true class (rows) vs. assigned cluster (columns).
table(dfnormal$cc, cc$cluster)

# Colour each point by its assigned cluster. fitted() returns one row per
# observation whose row name is the cluster id; `cc$cluster` is not used in
# the call because `cc` would resolve to the label column of `dfnormal`
# when the plot arguments are evaluated against the data.
plot(y ~ x, data = dfnormal, col = rownames(ccout))
# Drop the true class labels so clustering only sees the coordinates.
mdata <- dfcircle
mdata$cc <- NULL

# Run k-means with three centres.
cc <- kmeans(mdata, 3)
ccout <- fitted(cc)

# Inspect the clustering: true class (rows) vs. assigned cluster (columns).
# The label column is passed, not the whole data frame: table() requires
# all of its arguments to have the same length.
table(dfcircle$cc, cc$cluster)
# Result layout:
#     1  2  3
#  1
#  2
#  3

# Colour each point by its assigned cluster. fitted() returns one row per
# observation whose row name is the cluster id; `cc$cluster` is not used in
# the call because `cc` would resolve to the label column of `dfcircle`
# when the plot arguments are evaluated against the data.
plot(y ~ x, data = dfcircle, col = rownames(ccout))
# Drop the true class labels so clustering only sees the coordinates.
mdata <- dfnormal
mdata$cc <- NULL

# Run hierarchical clustering with average linkage on Euclidean distances
# and draw the dendrogram.
hc <- hclust(dist(mdata), method = "average")
plot(hc)

# Cut the tree into 5 groups and colour the points by group.
ccout <- cutree(hc, k = 5)
plot(y ~ x, data = dfnormal, col = ccout)
# Draw the dendrogram of the current hierarchical clustering.
plot(hc)

# Convert to a dendrogram object and print only its top two levels.
dend1 <- as.dendrogram(hc)
str(dend1, max.level = 2)
# Recorded output from the original run:
##  --[dendrogram w/ 2 branches and 2100 members at h = 18.4]
##   |--[dendrogram w/ 2 branches and 665 members at h = 12.5]
##   |  |--[dendrogram w/ 2 branches and 210 members at h = 8.45] ..
##   |  `--[dendrogram w/ 2 branches and 455 members at h = 12.2] ..
##   `--[dendrogram w/ 2 branches and 1435 members at h = 14.3]
##      |--[dendrogram w/ 2 branches and 6 members at h = 5.17] ..
##      `--[dendrogram w/ 2 branches and 1429 members at h = 9.28] ..
# Drop the true class labels so clustering only sees the coordinates.
mdata <- dfcircle
mdata$cc <- NULL

# Run hierarchical clustering with average linkage on Euclidean distances
# and draw the dendrogram.
hc <- hclust(dist(mdata), method = "average")
plot(hc)

# Cut the tree into 3 groups and colour the points by group.
ccout <- cutree(hc, k = 3)
plot(y ~ x, data = dfcircle, col = ccout)
# Re-cluster the same coordinates with single linkage.
hc <- hclust(d = dist(mdata), method = "single")
[1]: 数据之城,2012,"聚类算法实践",http://hi.baidu.com/sky88088
[2]: Manuel Fernández-Delgado, Eva Cernadas, et al., 2014, "Do we Need Hundreds of Classifiers to Solve Real World Classification Problems?", http://jmlr.org/papers/v15/delgado14a.html